The purpose of this project is to uncover and document valued actionable insights which are contained within the available source data for the benefit of the target audience.
The target audience includes coaches, personal trainers, athletes, ahletics governing body officials, interested members of the public, sports enthusiasts, sports statisticians and data scientists.
The objective is to explore Victorian interclub athletic competition results data for the complete 2017-18 season and identify:
Important note: This analysis is a not-for-profit independent analysis conducted by Bree McLennan, using publically available data from the Athletics Victoria Website. This analysis does not represent the opinions of Athletics Victoria.
The data for interclub rounds 1 to 12 is contained in individual csv files, by round, for each participating Victorian region.
General description of the source data:
Technical approach to creating the analysis data:
# Dataset structure
glimpse(tbl_df(wrk.03DataTrans_03))## Observations: 28,799
## Variables: 61
## $ KEYRegistrationNumber <fct> 2559, 10002, 446, 459, ...
## $ NUMDistance <int> NA, NA, NA, NA, NA, NA,...
## $ NUMStandardDistance <int> NA, NA, NA, NA, NA, NA,...
## $ NUMPerformance <dbl> 11.53, 22.93, 4.16, 4.0...
## $ NUMWindReading <dbl> NA, NA, -2.1, -1.4, 2.6...
## $ NUMPointsAwarded <int> 153, NA, 307, 283, 222,...
## $ CATAthleteRegisteredZone <fct> Ballarat, Ballarat, Bal...
## $ CATCompetitionVenue <fct> Ballarat, Ballarat, Bal...
## $ CATGender <fct> Female, Female, Female,...
## $ CATAthleteRegisteredClub <fct> BYC, WEN, BHA, WEN, BHA...
## $ CATAgeGroup <fct> F14, F14, F14, F14, F14...
## $ CATAthleteDivision <fct> 0, 0, 0, 0, 0, 0, 0, 0,...
## $ CATAthleteTeamNumber <fct> 1, 0, 1, 1, 1, 1, 1, 1,...
## $ CATEventDiscipline <fct> Javelin, Javelin, Long ...
## $ CATDistance <fct> NA, NA, NA, NA, NA, NA,...
## $ CATStandardDistance <fct> NA, NA, NA, NA, NA, NA,...
## $ CATEventSpecification <chr> "400g", "400g", "", "",...
## $ CATEventHeat <fct> 1, 1, 1, 1, 1, 1, 1, 1,...
## $ CATEventNote <fct> 400g, 400g, INV, None, ...
## $ ORDCompetitionRound <fct> 1, 1, 1, 1, 1, 1, 1, 1,...
## $ TXTPerformanceFormatted <chr> "11.53m", "22.93m", "4....
## $ BINValidEventAttempt <fct> 1, 1, 1, 1, 1, 1, 1, 1,...
## $ BINInvitationEventOrAthlete <fct> 0, 1, 0, 0, 0, 0, 0, 0,...
## $ BINTeamEvent <fct> 0, 0, 0, 0, 0, 0, 0, 0,...
## $ CATEventFullName <fct> Javelin 400g, Javelin 4...
## $ CATEventGroupL1 <fct> Field, Field, Field, Fi...
## $ CATEventGroupL2 <fct> Throw, Throw, Jump, Jum...
## $ CATAgeGroupLeveL1 <fct> Juniors, Juniors, Junio...
## $ CATAgeGroupKey <fct> , , , , , , , , , , , ,...
## $ BINAthleteWithDisability <fct> 0, 0, 0, 0, 0, 0, 0, 0,...
## $ CATAthleteAWDClass <fct> NA, NA, NA, NA, NA, NA,...
## $ ORDEventFinishOrder <int> 2, 1, 1, 2, 3, 4, 5, 6,...
## $ ORDEventFinishOrderPoints <dbl> 1, 0, 1, 2, 3, 4, 5, 6,...
## $ NUMTotAthletesInVenueAgeEventPoints <dbl> 1, 1, 6, 6, 6, 6, 6, 6,...
## $ NUMTotAthletesInVenueAgeEvent <int> 2, 2, 7, 7, 7, 7, 7, 7,...
## $ ORDRoundEventFinishOrder <int> 11, 4, 10, 13, 21, 26, ...
## $ ORDRoundEventFinishOrderPoints <dbl> 11, 0, 11, 14, 21, 26, ...
## $ NUMTotAthletesInRoundAgeEventPoints <dbl> 13, 13, 32, 32, 32, 32,...
## $ NUMTotAthletesInRoundAgeEvent <int> 13, 13, 33, 33, 33, 33,...
## $ CATClubDistrict <fct> Country, Country, Count...
## $ CATClubZoneName <fct> Ballarat, Ballarat, Bal...
## $ CATAthleticClubName <fct> Ballarat YCW Harriers, ...
## $ CATAthleticTrackVenueName <fct> Llanberris Athletic Res...
## $ CATVenueTrackType <fct> Synthetic, Synthetic, S...
## $ CATVenueAddress <chr> "York St, Golden Point,...
## $ CATVenueDistrict <fct> Country, Country, Count...
## $ CATVenueMapCoord <chr> "-37.5693937,143.865968...
## $ NUMVenueLatitude <dbl> -37.56939, -37.56939, -...
## $ NUMVenueLongitude <dbl> 143.866, 143.866, 143.8...
## $ CATVenueZone <fct> Ballarat, Ballarat, Bal...
## $ CATVenueVictoriaRegion <fct> Ballarat, Ballarat, Bal...
## $ NUMAdjustFactorAWD <dbl> NA, NA, NA, NA, NA, NA,...
## $ NUMAdjustFactorVET <dbl> NA, NA, NA, NA, NA, NA,...
## $ BINAthleteCompeteAwayVenue <fct> 1, 1, 1, 1, 1, 1, 1, 1,...
## $ CATAthleteEventStatus <fct> OK, OK, OK, OK, OK, OK,...
## $ NUMPerformanceAWDAdjusted <dbl> 11.53, 22.93, 4.16, 4.0...
## $ ORDRoundEventFinishOrderAWDAdj <int> 11, 4, 10, 13, 21, 26, ...
## $ ORDRoundEventFinishOrderPointsAWDAdj <dbl> 11, 0, 11, 14, 21, 26, ...
## $ NUMEventFinishOrderPoints11 <dbl> 11, 0, 11, 9, 8, 7, 6, ...
## $ NUMRoundEventFinishOrderPoints11 <dbl> 1, 0, 1, 1, 1, 1, 1, 1,...
## $ NUMRoundEventFinishOrderPoints11AWDAdj <dbl> 1, 0, 1, 1, 1, 1, 1, 1,...
# Randomly sample 6 rows from the analysis dataset
head(wrk.03DataTrans_03[sample(nrow(wrk.03DataTrans_03))])# Participation rates for each round
wrk.03DataTrans_Q1A <- wrk.03DataTrans_03 %>%
filter(KEYRegistrationNumber %ni% c("0")) %>% #remove teams
group_by(ORDCompetitionRound) %>%
summarise(NUMAthletesParticipating = n_distinct(KEYRegistrationNumber),
NUMTotalEventsParticipated = n())
barplot(rev(wrk.03DataTrans_Q1A$NUMAthletesParticipating),
main = "Athlete participation by round",
col = rgb(0.2,0.4,0.6,0.6), horiz = TRUE , las = 1 ,
xlab = "Number of athletes",
names.arg = rev(wrk.03DataTrans_Q1A$ORDCompetitionRound))wrk.03DataTrans_Q1A# Distribution of participation across rounds
# Calculate participation by athlete
FreqTable <- as.data.table(xtabs(~ KEYRegistrationNumber + ORDCompetitionRound, wrk.03DataTrans_03))
FreqTableCast <- dcast.data.table(FreqTable, KEYRegistrationNumber ~ as.numeric(ORDCompetitionRound), value.var = "N")
FreqTableCast_1 <- FreqTableCast %>%
mutate(NUMTotalEventsPartipated = rowSums(FreqTableCast[, c(2:12)])) %>%
mutate(NUMTotalRoundsParticipated = apply(FreqTableCast[, c(2:12)], 1, function(a) sum(a > 0)) )
head(FreqTableCast_1)FreqTableCast_2 <- FreqTableCast_1 %>%
filter(KEYRegistrationNumber %ni% c("0")) #not including teams
plot(FreqTableCast_2$NUMTotalRoundsParticipated, FreqTableCast_2$NUMTotalEventsPartipated )# Athletes competing in all rounds of competition
wrk.03DataTrans_Q1B <- wrk.03DataTrans_03 %>%
filter(KEYRegistrationNumber %ni% c("0")) %>% #remove teams
group_by(KEYRegistrationNumber) %>%
summarise(NUMAthletesRounds = n_distinct(ORDCompetitionRound)) %>%
filter(NUMAthletesRounds >= 11) %>%
summarise(NUMTotalAthletesAllRounds = n(),
NUMRounds = max(NUMAthletesRounds))
wrk.03DataTrans_Q1B# How many athletes competed at away venues?
wrk.03DataTrans_Q1C <- wrk.03DataTrans_03 %>%
filter(KEYRegistrationNumber %ni% c("0") & BINAthleteCompeteAwayVenue == 1) %>% #remove teams
group_by(BINAthleteCompeteAwayVenue) %>%
summarise(NUMAthletesAway = n_distinct(KEYRegistrationNumber))
wrk.03DataTrans_Q1C# Participation by event
wrk.03DataTrans_Q2A <- wrk.03DataTrans_03 %>%
group_by(ORDCompetitionRound, CATEventFullName) %>%
summarise(NUMAthletesParticipating = n_distinct(KEYRegistrationNumber),
NUMTotalEventsParticipated = n())
wrk.03DataTrans_Q2A# The most popular events TODO:FIX
wrk.03DataTrans_Q2B <- wrk.03DataTrans_Q2A %>%
group_by(CATEventFullName) %>%
summarise(NUMTotalEventsParticipated = n()) %>%
arrange(desc(NUMTotalEventsParticipated))
wrk.03DataTrans_Q2B# incomplete events & invalid attempts
wrk.03DataTrans_Q3A <- wrk.03DataTrans_03 %>%
filter(BINValidEventAttempt == 0 & CATAthleteEventStatus %ni% c("OK")) %>%
group_by(CATEventFullName, CATAthleteEventStatus) %>%
summarise(NUMEventStatus = n())
wrk.03DataTrans_Q3A# Lets take an event like 400m. Circular event, wind readings not required. Hypothesis: Good measure of speed endurance & fitness
wrk.03DataTrans_Q4A <- wrk.03DataTrans_03 %>%
filter(CATEventFullName == "400 Run") %>%
select(ORDCompetitionRound, CATEventFullName, NUMPerformance, CATGender) #CATAgeGroup)
#group_by(ORDCompetitionRound, CATEventFullName) %>%
#summarise(NUMAvgPerformance = mean(NUMPerformance))
library(ggplot2)
ggplot(wrk.03DataTrans_Q4A, aes(x = ORDCompetitionRound, y = NUMPerformance, fill = CATGender)) +
geom_boxplot() +
scale_fill_brewer(palette = "Spectral")## Warning: Removed 109 rows containing non-finite values (stat_boxplot).
head(wrk.03DataTrans_Q4A)# Lets look at a power event like shot put
wrk.03DataTrans_Q4B <- wrk.03DataTrans_03 %>%
filter(CATEventDiscipline == "Shot Put") %>%
select(ORDCompetitionRound, CATEventFullName, CATEventDiscipline, NUMPerformance, CATGender)
ggplot(wrk.03DataTrans_Q4B, aes(x = ORDCompetitionRound, y = NUMPerformance, fill = CATGender)) +
geom_boxplot() +
scale_fill_brewer(palette = "Spectral")## Warning: Removed 46 rows containing non-finite values (stat_boxplot).
# Setup data to create map plot
wrk.03DataTrans_PlotMap <- wrk.03DataTrans_03 %>%
filter(KEYRegistrationNumber %ni% c("0")) %>% #remove teams
group_by(ORDCompetitionRound, CATCompetitionVenue, NUMVenueLatitude, NUMVenueLongitude) %>%
summarise(NUMAthletes_RV = n_distinct(KEYRegistrationNumber),
NUMEventsParticipated_RV = n(),
NUMAvgWindReading_RV = round(mean(!is.na(as.numeric(NUMWindReading))), digits = 3)
)
# Create a map and plot all venues used during the season
leaflet(data = wrk.03DataTrans_PlotMap) %>%
addTiles() %>%
addMarkers(~NUMVenueLongitude, ~NUMVenueLatitude, popup = ~as.character(CATCompetitionVenue), label = ~as.character(CATCompetitionVenue))# The windiest venue
wrk.03DataTrans_PlotMap <- arrange(wrk.03DataTrans_PlotMap, desc(NUMAvgWindReading_RV))
wrk.03DataTrans_PlotMap# Setup calculated fields
wrk.03DataTrans_SUMM01A <- wrk.03DataTrans_03 %>%
filter(KEYRegistrationNumber %ni% c("0")) %>% #remove teams
group_by(KEYRegistrationNumber, CATAgeGroup, CATAthleticClubName, CATClubZoneName) %>%
summarise(NUMTotalPoints11 = sum(NUMEventFinishOrderPoints11, na.rm = TRUE),
NUMTotalAVPointsAwarded = sum(NUMPointsAwarded, na.rm = TRUE),
NUMTotalRoundPoints11 = sum(NUMRoundEventFinishOrderPoints11, na.rm = TRUE),
NUMTotalRoundPoints11AWDAdj = sum(NUMRoundEventFinishOrderPoints11AWDAdj, na.rm = TRUE)) %>%
arrange(desc(NUMTotalPoints11))
par(mfrow = c(4,1))
# BY ATHLETE / REGISTRATION ID
# Horizontal Barplot [Method: 1st=11, 2-9=11minus finish order, >10 = 1 point, event/venue based]:
barplot(rev(wrk.03DataTrans_SUMM01A$NUMTotalPoints11[1:10]),
main = "AV Shield 2017-18: Points awarded [1st=11]",
col = rgb(0.2,0.4,0.6,0.6), horiz = TRUE , las = 1 ,
xlab = "Total Points Awarded [11]",
names.arg = rev(wrk.03DataTrans_SUMM01A$KEYRegistrationNumber[1:10]))
head(wrk.03DataTrans_SUMM01A, n = 10) # Horizontal Barplot [Method Decathlon WR, AV]:
wrk.03DataTrans_SUMM01B <- arrange(wrk.03DataTrans_SUMM01A, desc(NUMTotalAVPointsAwarded))
barplot(rev(wrk.03DataTrans_SUMM01B$NUMTotalAVPointsAwarded[1:10]),
main = "AV Shield 2017-18: Points awarded [AV Shield]",
col = rgb(0.2,0.4,0.6,0.6), horiz = TRUE , las = 1 ,
xlab = "Total Points Awarded [AV Shield]",
names.arg = rev(wrk.03DataTrans_SUMM01B$KEYRegistrationNumber[1:10]))
head(wrk.03DataTrans_SUMM01B, n = 10) # Horizontal Barplot [Method 1st=11, 2-9=11minus finish order, >10 = 1 point, by Round]:
wrk.03DataTrans_SUMM01C <- arrange(wrk.03DataTrans_SUMM01A, desc(NUMTotalRoundPoints11))
barplot(rev(wrk.03DataTrans_SUMM01C$NUMTotalRoundPoints11[1:10]),
main = "AV Shield 2017-18: Points awarded [1st=11, round]",
col = rgb(0.2,0.4,0.6,0.6), horiz = TRUE , las = 1 ,
xlab = "Total Points Awarded [11 by round]",
names.arg = rev(wrk.03DataTrans_SUMM01C$NUMTotalRoundPoints11[1:10]))
head(wrk.03DataTrans_SUMM01C, n = 10) # Horizontal Barplot [Method: 1st=11, 2-9=11minus finish order, >10 = 1
# point, with AWD performance Adjust by Round]:
wrk.03DataTrans_SUMM01D <- arrange(wrk.03DataTrans_SUMM01A, desc(NUMTotalRoundPoints11AWDAdj))
barplot(rev(wrk.03DataTrans_SUMM01D$NUMTotalRoundPoints11AWDAdj[1:10]),
main = "AV Shield 2017-18: Points awarded [AWD Adjust,1st=11, round]",
col = rgb(0.2,0.4,0.6,0.6), horiz = TRUE , las = 1 ,
xlab = "Total Points Awarded [AWD Adjust & 11 by round]",
names.arg = rev(wrk.03DataTrans_SUMM01D$NUMTotalRoundPoints11AWDAdj[1:10])) head(wrk.03DataTrans_SUMM01D, n = 10)TODO: add in lessons learned, traps for young players